In [1]:
%matplotlib inline
from matplotlib import pyplot as plt
Next, we set the working directory...
In [2]:
import os
working_dir = '~/Data/marcelle-hcs-sample/features'
os.chdir(os.path.expanduser(working_dir))
In [3]:
import paramiko as mk
with open('dirs.txt', 'r') as fin:
    line = fin.readline().rstrip()
dirs = line.split(' ')
from husc.screens import myofusion as myo
from importlib import reload
reload(myo)
remote_base_dir = 'marcelle/raw-data'
plateids = [myo.dir2plate(d) for d in dirs]  # a list, so we can iterate it repeatedly
feature_fns = ['features-%i.h5' % p for p in plateids]
# set up an SFTP transport to fetch files from merri on demand
trans = mk.Transport(('merri.vlsci.unimelb.edu.au', 22))
trans.connect(pkey=mk.Agent().get_keys()[0], username='jni')
sftp = mk.SFTPClient.from_transport(trans)
#for d, f in zip(dirs, feature_fns):
#    if not os.path.exists(f):
#        sftp.get(os.path.join(remote_base_dir, d, 'features.h5'), f)
The download above only needs to happen once, so we keep the already-run code here for reference:
import time
import subprocess as sp
for d, f in zip(dirs, feature_fns):
    if not os.path.exists(f):  # allow resumption of interrupted scps
        time.sleep(1)  # merri appears to choke with too many scps
        sp.check_call(['scp', 'jni@merri.vlsci.unimelb.edu.au:' +
                       os.path.join(remote_base_dir, d, 'features.h5'), f])
Let's do a sanity check to ensure plate ids are unique:
In [4]:
print(len(dirs), len(set(dirs)),
      len(plateids), len(set(plateids)))
from collections import Counter
counts = Counter(plateids)
print([k for k, v in counts.items() if v > 1])  # any duplicated plate ids
In [5]:
import os
import pandas as pd
import numpy as np
In [6]:
# note: the feature files are named 'features-<plateid>.h5', as created above
h5_fns = sorted(f for f in os.listdir('.')
                if f.startswith('features-') and f.endswith('.h5')
                and not f.endswith('NOCODE.h5'))
datasets = [pd.read_hdf(f, 'data') for f in h5_fns]
In [7]:
data = pd.concat(datasets)
print(data)
In [8]:
from sklearn.preprocessing import StandardScaler
from sklearn import decomposition
In [9]:
X = StandardScaler().fit_transform(data)
pca = decomposition.PCA(n_components=2)
X_reduced = pca.fit_transform(X)
print(X_reduced.shape)
The obligatory PCA scatter:
In [10]:
plt.scatter(*(X_reduced.T))
Out[10]:
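Before reading too much into the scatter, it is worth checking how much variance the top two components actually capture. A quick sketch using scikit-learn's explained_variance_ratio_ attribute:

print(pca.explained_variance_ratio_)        # fraction of variance per component
print(pca.explained_variance_ratio_.sum())  # total captured by the 2D projection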
Which features contribute most to the top two PCA components?
In [11]:
# top three features by absolute loading: large negative loadings matter too
pca_comp1, pca_comp2 = (np.argsort(np.abs(pca.components_[0]))[-1:-4:-1],
                        np.argsort(np.abs(pca.components_[1]))[-1:-4:-1])
data.columns[pca_comp1], data.columns[pca_comp2]
Out[11]:
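To see how dominant those features are relative to the rest, we can also print the loading values themselves (a quick follow-up sketch, reusing the index arrays from the cell above):

print(pca.components_[0][pca_comp1])
print(pca.components_[1][pca_comp2])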
In [12]:
from sklearn import cluster
In [13]:
# aim for an average cluster size of 20
avg_cluster_size = 20
K = cluster.MiniBatchKMeans(n_clusters=(X.shape[0] // avg_cluster_size))
K = K.fit(X)
In [14]:
from husc import preprocess as pre
reload(pre)
myores_semantic_filename = myo.myores_semantic_filename
In [15]:
plate2dir = myo.make_plate2dir_dict(dirs)
In [16]:
sems = [myo.myores_semantic_filename(fn) for fn in data.index]
data.index = [(s['plate'], s['well']) for s in sems]
In [17]:
plates = set([ix[0] for ix in data.index])
print(plates)
In [18]:
from pymongo import MongoClient
collection = MongoClient()["myofusion"]["wells"]
def get_cluster_gene_data(cluster_id):
    """Return the (plate, well) keys and database entries for one cluster."""
    cluster_members = [data.index[i]
                       for i in np.flatnonzero(K.labels_ == cluster_id)]
    cluster_entries = [collection.find_one({"_id": myo.key2mongo(c)})
                       for c in cluster_members]
    return cluster_members, cluster_entries
In [19]:
print(len(K.labels_), len(np.unique(K.labels_)), np.max(K.labels_))
plt.plot(np.bincount(K.labels_))
Out[19]:
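The mean cluster size should sit near our target of 20, since we asked for n_samples // 20 clusters. A quick check (a sketch; bincount counts the members per label):

print(np.bincount(K.labels_).mean())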
In [20]:
cluster_members_1, cluster_entries_1 = get_cluster_gene_data(17)
print(len(cluster_members_1))
cluster_entries_1[:3]
Out[20]:
We can use the plate and well numbers to grab the images that were analysed.
The database stores the location of each file on merri. We use our previously established paramiko connection (held by the sftp object) to fetch files on request.
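For example, fetching a single remote image to a local temporary file looks like this (a minimal sketch; the remote path shown is hypothetical):

import tempfile as tmp
remote_path = 'marcelle/raw-data/plate1/image_A01.tif'  # hypothetical path
fd, local_path = tmp.mkstemp(suffix='.tif', dir='.')
os.close(fd)  # mkstemp returns an open descriptor we don't need
sftp.get(remote_path, local_path)

The get_images function below wraps this pattern, looking up remote paths in MongoDB and attaching informative labels to each image.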
In [21]:
import tempfile as tmp
import mahotas as mh
def get_doc_value(mongo_doc, key):
    """Return (value, missing): missing is True if `key` was absent."""
    try:
        return mongo_doc[key], False
    except KeyError:
        return "", True

def get_images(key, key_type='_id',
               ids_returned=["_id", "gene_name"],
               failsafe_ids=["label"]):
    """Fetch all images matching `key` and label them from the database."""
    if key_type == "_id" and isinstance(key, tuple):
        key = myo.key2mongo(key)
    im_entries = collection.find({key_type: key})
    ims = []
    ids = []
    for entry in im_entries:
        im_path = entry["filename"]
        # mkstemp returns (fd, path); close the descriptor to avoid leaking it
        fd, tmpfile = tmp.mkstemp(suffix='.' + im_path.split('.')[-1],
                                  dir='.')
        os.close(fd)
        sftp.get(im_path, tmpfile)
        ims.append(mh.imread(tmpfile))
        id_list = []
        fail = False
        for ix in ids_returned:
            val, missing = get_doc_value(entry, ix)
            fail = fail or missing  # fall back if *any* id was missing
            id_list.append(val)
        if fail:
            for ix in failsafe_ids:
                id_list.append(get_doc_value(entry, ix)[0])
        ids.append(" ".join(map(str, id_list)))
    labeled_ims = list(zip(ids, ims))  # a list, so len() works downstream
    return labeled_ims
Now we want to look at the images. The function below shows them in a grid, three to a row.
In [22]:
def show_3ims(label_ims):
    """Show labeled images in a grid, three to a row."""
    num_rows = int(np.ceil(len(label_ims) / 3))  # true division in Python 3
    fig, axes = plt.subplots(num_rows, 3, figsize=(15, 5 * num_rows))
    for (name, im), ax in zip(label_ims, axes.ravel()):
        ax.imshow(im)
        ax.set_title(name)
In [23]:
cluster_members_1
Out[23]:
In [24]:
try:
    ims = [get_images(k)[0] for k in cluster_members_1]
except Exception:  # print the offending filenames before re-raising
    print([c["filename"] for c in cluster_entries_1])
    raise
In [25]:
show_3ims(ims)